6 Diagnostics and Analysis

6.1 Investigate high variance wells

nrow.source = function(df, facilityname, sourcename) {
  stopifnot(length(sourcename)==1)
  return(nrow(df %>% filter(well==facilityname, source==sourcename)))
}
well_summaries = outframe %>%
  filter(facility %in% well_names, variable=="mf_pred") %>%
  group_by(facility) %>%
  summarise(mean = mean(value),
            sd = sd(value),
            n_test = nrow.source(regression_df, unique(facility),"Well Tests"),
            n_pi = nrow.source(regression_df, unique(facility), "PI Database"),
            use.test = ifelse(n_test>0, "Test data", "No test data"),
            use.pi = ifelse(n_pi>0, "PI data", "No PI data")) %>%
  arrange(desc(sd))
well_summaries$production.curve = ifelse(well_summaries$facility %in% liq_wells, "Production curve", "Time series")

fp_summaries = list(fp14 = well_summaries %>% filter(facility %in% flows_to(censor('fp14', 'fp'))),
                    fp15 = well_summaries %>% filter(facility %in% flows_to(censor('fp15', 'fp'))),
                    fp16 = well_summaries %>% filter(facility %in% flows_to(censor('fp16', 'fp'))))
for (fp in names(fp_summaries)) {
  print(xtable(fp_summaries[[fp]] %>% select(-c(use.test, use.pi, production.curve)),
               type = "latex",
               caption=paste("Data methods feeding flash plant", censor(fp, 'fp')),
               label=paste0("tab:well_summaries_", fp)),
      table.placement = "H",
      file = paste0("../_media/summaries_", fp, ".tex"))
}

n_summaries = well_summaries %>%
  group_by(use.pi, use.test) %>%
  count()

sourceplot = ggplot(well_summaries, aes(x=1, y=log(sd))) +
  geom_boxplot(fill='steelblue') +
  geom_label(data=n_summaries, aes(x=-Inf, y=-Inf, hjust=0, vjust=0, label=paste0("n=", n), family="Times New Roman"), label.size=0, fill='white') +
  facet_grid(.~ use.pi + use.test) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
  labs(title="Differences in Production Error by Data Source", x="Production curve data source", y="log(standard deviation)")# +
  # ggsave('../_media/error_source.png', width=24.7*0.5, height=6, units='cm')
ggplotly(sourceplot)
sourcetab = well_summaries %>% select(facility, mean, sd, n_test, n_pi)
# print(xtable(sourcetab %>% head(), type = "latex",
#              caption="Upon inspection of the wells with the most variance, there is no immediate cause for high variance. This requires further investigation.",
#              label="tab:well_summaries"),
#       table.placement = "h",
#       file = "../_media/well_summaries.tex")
kable(cbind(sourcetab)) %>%
  kable_styling() %>%
  scroll_box(height = "400px")
facility mean sd n_test n_pi
wk65 102.2934749 109.2036524 2 0
wk86 70.9098346 62.1805718 4 0
wk28 88.4427565 51.5255323 26 0
wk242 320.7698705 47.5436728 28 0
wk27 167.8148213 35.4230481 36 0
wk59 112.4056454 28.7084217 31 0
wk81 62.0394664 26.9696698 24 0
wk26b 111.2556339 26.1285785 13 0
wk116 78.2828876 25.9934521 10 0
wk123 183.5375107 23.1481245 31 0
wk76 123.1079479 22.4739026 31 0
wk96 28.7114853 22.1074050 10 0
wk72 153.2740928 20.2675648 26 0
wk71 130.5979298 20.2435489 32 0
wk67 93.0195776 19.3605826 26 0
wk256 218.0857737 18.6133809 32 30
wk268 88.5181131 17.7488203 27 30
wk266 324.7101171 16.9025924 29 30
wk264 394.3901555 15.7904385 34 30
wk245 420.7193707 15.3968544 41 30
wk26a 118.4680639 14.4939735 16 0
wk265 334.3693427 13.7743788 34 30
wk267 306.3311541 13.7699431 31 30
wk74 155.9695681 13.7300618 26 0
wk83 133.4440544 13.4643671 26 0
wk255 378.6293910 13.4585292 41 30
wk250 23.7800237 12.8711906 0 30
wk269 133.3360082 11.7048139 25 30
wk55 63.4294064 10.9022814 47 0
wk235 205.8764431 10.3997032 20 0
wk262 632.6696669 9.9788955 36 30
wk70 150.3092709 9.8630844 45 0
wk244 196.6758263 9.6261239 37 30
wk229 202.3362270 9.3875046 92 0
wk124 253.0930871 9.1417281 31 0
wk222 107.4783230 8.2663053 44 0
wk243 374.5543382 8.0256282 52 30
wk207 150.9524750 6.4657032 18 0
wk247 233.2013668 6.4012979 48 30
wk271 86.3065011 6.3809900 6 30
wk239 132.9953467 5.7918576 18 0
wk46 43.9109434 5.2722978 18 0
wk260 295.9067337 4.8348543 33 30
wk261 250.1655260 4.7207026 37 30
wk253 269.6862272 4.5556241 32 30
wk270 461.4276278 4.0426321 5 30
wk258 198.7028243 3.5453950 38 30
wk259 309.1449514 3.2888729 37 30
wk263 221.5213149 3.2590841 34 30
wk272 207.6220684 2.8044848 7 30
wk254 226.6584657 2.4440172 39 30
wk605 35.3459877 1.3452568 0 0
wk237 14.1962301 1.0528268 0 30
wk606 22.1277314 0.8462717 0 0
wk233 13.1344091 0.5505461 0 30
wk241 45.8576241 0.5211013 0 30
wk238 60.1674907 0.3661648 0 30
wk607 1.4918209 0.3595182 0 0
wk249 1.2393797 0.3220994 0 0
wk610 6.7519598 0.3090580 0 0
wk251 22.7806092 0.2287621 0 30
wk234 30.8827687 0.1998563 0 30
wk25 6.7878732 0.1874126 0 0
wk240 23.5225646 0.1584181 0 30
wk232 0.1900650 0.1415838 0 30
wk228 15.0023497 0.1067671 0 30
wk236 26.3313337 0.0811007 0 0
wk216 8.1389004 0.0778926 0 0
wk252 58.9341342 0.0641691 0 30
wk118 9.5062240 0.0327287 0 0
wk604 0.7711713 0.0280654 0 0
wk66 0.0008232 0.0008350 0 0
wk92 0.0003939 0.0003999 0 0
wk101 0.0003719 0.0003774 0 0
wk88 0.0000000 0.0000000 0 0

6.2 Regression Fits

prod = as.data.frame(outmatrix) %>%
  select(contains('prod')) %>%
  gather(key=facility, value=value) %>%
  mutate(which=parse_number(facility)) %>%
  mutate(whp=data$whp_prod[which],
         well = names(ids)[data$well_id_prod[which]]) %>%
  rename(mf=value) %>%
  group_by(well, whp) %>%
  summarise(lower=quantile(mf, 0.025),
            upper=quantile(mf, 0.975),
            mean=mean(mf))

plotdata = regression_df %>%
  filter(well_id %in% ids[production_curve_wells]) %>%
  mutate(datetime = factor(as.Date(date))) %>%
  mutate(source = factor(source, levels=c("Well Tests", "PI Database")))

# regression plot
regplot = ggplot(prod, aes(x=whp)) +
  geom_line(aes(y=mean, color=well)) +
  geom_ribbon(aes(ymin=lower, ymax=upper, fill=well), alpha=0.25) +
  geom_point(data=plotdata, aes(y=mf, color=well, size=date, shape=source), alpha=0.5) +
  labs(title="Linear Regression on Test and PI Data", x="Well-head pressure (bar)", y="Mass flow (T/h)", color="Well", shape="Data source", size="Date", fill="Well") +
  coord_cartesian(xlim=c(min(plotdata$whp)*0.9,max(plotdata$whp)*1.1), ylim=c(0,max(plotdata$mf)*1.1))# +
  # ggsave('../_media/production_curve.png', width=24.7*0.48, height=24.7*0.48, units='cm')
ggplotly(regplot)

6.3 Time Series Plots

tsplotwells = ar_wells
ts_fit = as.data.frame(outmatrix) %>%
  select(contains('mf_ts')) %>%
  gather() %>%
  mutate(index = parse_number(key)) %>% select(-key) %>%
  group_by(index) %>%
  summarise(lower=quantile(value, 0.025),
            upper=quantile(value, 0.975),
            mean=mean(value)) %>%
  cbind(ts) %>%
  mutate(well = factor(names(ids[well_id_ts])),
         date_numeric = date_numeric_ts)

# actual observations
tsplotdata = dry_df %>%
  filter(well_id %in% ids[tsplotwells]) %>%
  mutate(datetime = factor(as.Date(date)),
         facility = well)

# experimental AR1 time series
ar_fit = as.data.frame(outmatrix) %>%
  select(contains("mu_ar")) %>%
  gather() %>%
  mutate(date_numeric = as.numeric(str_extract(key, "(?<=\\[)(.*?)(?=,)")) + min(dry_df$date_numeric) - 1,
         facility = names(ids)[as.numeric(str_extract(key, "(?<=,)(.*?)(?=\\])"))]) %>%
  select(facility, date_numeric, value) %>%
  group_by(facility, date_numeric) %>%
  summarise(mean=mean(value),
            lower=quantile(value, 0.025),
            upper=quantile(value, 0.975)) %>%
  filter(facility %in% tsplotwells)

# experimental EMA time series
ewma_fit = as.data.frame(outmatrix) %>%
  select(contains("mu_ema")) %>%
  gather() %>%
  mutate(date_numeric = as.numeric(str_extract(key, "(?<=\\[)(.*?)(?=,)")) + min(dry_df$date_numeric) - 1,
         facility = names(ids)[as.numeric(str_extract(key, "(?<=,)(.*?)(?=\\])"))]) %>%
  select(facility, date_numeric, value) %>%
  group_by(facility, date_numeric) %>%
  summarise(mean=mean(value),
            lower=quantile(value, 0.025),
            upper=quantile(value, 0.975)) %>%
  filter(facility %in% tsplotwells)

# find plot limits
tsmax = max(c(ts_fit$upper, ar_fit$upper, ewma_fit$upper))

lintsplot = ggplot(ts_fit, aes(x=date_numeric, color=well, fill=well)) +
  geom_line(aes(y=mean), linetype="dashed") +
  geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.25) +
  geom_line(data=tsplotdata, aes(y=mf)) +
  geom_vline(aes(xintercept = max(tsplotdata$date_numeric)), linetype="dashed", color="red") +
  coord_cartesian(ylim=c(0, 60)) +
  labs(title=paste("Linear Time Series Regression for Selected Wells in PI"), x="Days since baseline (2000)", linetype="")# +
  # ggsave('../_media/dry_time_series.png', width=24.7, height=8, units='cm')

arplot = ggplot(ar_fit %>% filter(facility %in% tsplotwells), aes(x=date_numeric, y=mean, fill=facility, color=facility)) +
  geom_line(data=tsplotdata, aes(y=mf)) +
  geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.5) +
  geom_line(linetype="dashed") + coord_cartesian(ylim=c(0, 60)) +
  geom_vline(aes(xintercept = max(tsplotdata$date_numeric)), linetype="dashed", color="red") +
  labs(title="AR(1) Experiment", x="Days since first date", y="Mass flow (T/h)") +
  ggsave('../_media/ar_experiment.png', width=24.7, height=8, units='cm')

ewmaplot = ggplot(ewma_fit, aes(x=date_numeric, y=mean, fill=facility, color=facility)) +
  geom_line(data=tsplotdata, aes(y=mf)) +
  geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.5) +
  geom_line(linetype="dashed") + coord_cartesian(ylim=c(0, 60)) +
  geom_vline(aes(xintercept = max(tsplotdata$date_numeric)), linetype="dashed", color="red") +
  labs(title="EWMA Experiment", x="Days since first date")# +
  # ggsave('../_media/ewma_experiment.png', width=24.7, height=8, units='cm')

ggplotly(lintsplot)
ggplotly(arplot)
ggplotly(ewmaplot)
tsgrob = grid_arrange_shared_legend(lintsplot, arplot, ewmaplot, nrow=3, ncol=1, position = "bottom")

tsgrob
## TableGrob (2 x 1) "arrange": 2 grobs
##   z     cells    name              grob
## 1 1 (1-1,1-1) arrange   gtable[arrange]
## 2 2 (2-2,1-1) arrange gtable[guide-box]
# ggsave('../_media/ts_experiment.png', tsgrob, width=24.7, height=24, units='cm')

6.4 Goodness of fit (OLS regression)

liq_fit = as.data.frame(outmatrix) %>%
  select(contains('mf_fit')) %>%
  gather(key='index', value='fitted') %>%
  mutate(index=as.integer(parse_number(index))) %>%
  group_by(index) %>%
  summarise(lower=quantile(fitted, 0.025),
            upper=quantile(fitted, 0.975),
            Fitted=mean(fitted),
            std=sd(fitted)) %>%
  cbind(regression_df) %>%
  mutate(`Standardised residual` = (Fitted-mf)/std,
         Well = factor(names(ids[well_id])),
         Observed = mf) %>%
  gather(key="key", value="value", `Standardised residual`, Observed) %>%
  select(Well, key, Fitted, value, source)

diagplot = ggplot(liq_fit, aes(x=Fitted, y=value)) +
  geom_point(aes(color=Well, shape=Well)) + scale_shape_manual(values = rep_len(1:25, length(unique(liq_fit$Well)))) +
  geom_smooth(color='black') +
  facet_wrap(~key, scales="free") +
  geom_hline(data=data.frame(key="Standardised residual", value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
  geom_abline(data=data.frame(key="Observed", a = 1, b = 0), aes(slope = a, intercept=b), color='red') +
  # coord_cartesian(ylim=c(-4, 4)) +
  labs(title="Diagnostic Plots", x="Fitted mass flow (T/h)", y="") +
  theme(legend.position = "bottom") +
  guides(color=guide_legend(nrow=3, byrow=T), shape=guide_legend(nrow=3, byrow=T))# +
  # ggsave('../_media/diagnostics.png', width=24.7, height=12, units='cm')
ggplotly(diagplot)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
selectwells = liq_fit %>% group_by(Well, key) %>% summarise(fittedsd = sd(Fitted)) %>%
  arrange(desc(fittedsd)) %>% head(56*2) %>% pull(Well)

observedplot = ggplot(liq_fit %>% filter(key=="Observed", Well %in% selectwells), aes(x=Fitted, y=value)) +
  geom_point(aes(color=source), alpha=0.5) +
  geom_smooth(color=NA, alpha=0.5) +
  facet_wrap(~Well, scales="free") +
  # geom_hline(data=data.frame(key="Standardised residual", value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
  geom_abline(data=data.frame(key="Observed", a = 1, b = 0), aes(slope = a, intercept=b)) +
  labs(title="Linear Regression Fit Plots Per Well", x="Fitted mass flow (T/h)", y="Observed mass flow (T/h)", color="Data source") +
  theme(legend.position = "bottom")# +
  # guides(color=guide_legend(nrow=3, byrow=T), shape=guide_legend(nrow=3, byrow=T)) +
  # ggsave('../_media/observed.png', width=24.7, height=24.7, units='cm')

stdresplot = ggplot(liq_fit %>% filter(key=="Standardised residual", Well %in% selectwells), aes(x=Fitted, y=value)) +
  geom_point(aes(color=source), alpha=0.5) +
  geom_smooth(color=NA, alpha=0.5) +
  facet_wrap(~Well, scales="free_x") +
  geom_hline(data=data.frame(key="Standardised residual", value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
  # geom_abline(data=data.frame(key="Observed", a = 1, b = 0), aes(slope = a, intercept=b), color='red') +
  labs(title="Linear Regression Residual Plots Per Well", x="Fitted mass flow (T/h)", y="Standardised residual", color="Data source") +
  coord_cartesian(ylim=c(-5, 5)) + theme(legend.position="bottom")# +
  # guides(color=guide_legend(nrow=3, byrow=T), shape=guide_legend(nrow=3, byrow=T)) +
  # ggsave('../_media/stdres.png', width=24.7, height=24.7, units='cm')

ggplotly(observedplot)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplotly(stdresplot)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# stdres_min = liq_fit %>% filter(key=="Standardised residual") %>% pull(value) %>% min()
# stdres_max = liq_fit %>% filter(key=="Standardised residual") %>% pull(value) %>% max()
# ggplot(liq_fit %>% filter(key=="Standardised residual"), aes(x=value)) +
#   geom_density(fill="red", alpha=0.5, color=NA) +
#   geom_line(data=data.frame(x=seq(stdres_min, stdres_max, length.out=100)), aes(x=x, y=dnorm(x)))

6.5 Limits and Constraint Violations

sf.df <- outframe %>% 
  filter(str_detect(variable, "total_sf") & value > 0) %>% 
  droplevels()
limits = fp_constants %>%
  mutate(facility = names(ids)[fp_id]) %>%
  select(facility, limit) %>% 
  drop_na()

p.limits = sf.df %>%
  left_join(limits, by=c("facility")) %>%
  mutate(greater = value > limit) %>%
  group_by(facility) %>%
  summarise(p.greater = mean(greater)) %>%
  drop_na()

limitplot = ggplot(sf.df %>% filter(facility %ni% incomplete.fps), aes(x=value, fill=facility)) +
  facet_wrap(~facility, scales = "free_y", ncol=2) +
  geom_density(alpha=0.5, color=NA) +
  geom_vline(data=limits, aes(xintercept=limit), color="red") +
  geom_label(data=p.limits %>% filter(facility %ni% incomplete.fps), aes(x=-Inf, y=Inf, hjust=0, vjust=1, label=paste0("p(>lim)=", p.greater), family="Times New Roman"), color="black", label.size=0, fill='white') +
  theme(legend.position="none",
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) +
  labs(title="Posterior Flash Plant Mass Flows", x="Steam flow (T/h)", y="Density", fill="Flash plant", color="Steam flow limit")# +
  # ggsave('../_media/constraints.png', width=24.7, height=10, units='cm')
ggplotly(limitplot)

6.6 Flow Comparison

flow.df <- outframe %>% 
  filter(facility %in% fp_names) %>%
  filter(str_detect(variable, "mf_pred|ip_sf|lp_sf|wf") & value > 0) %>%
  mutate(variable=ifelse(variable=="mf_pred", "mf", variable),
         variable=factor(variable, levels=c("mf", "ip_sf", "lp_sf", "wf")))

comparison = fp_constants %>% select("fp", contains("verification")) %>%
  rename(facility=fp) %>%
  gather(key="variable", value="value", -facility) %>%
  mutate(variable = gsub("^verification_", "", variable),
         variable=factor(variable, levels=c("mf", "ip_sf", "lp_sf", "wf"))) %>%
  drop_na()

ps = flow.df %>%
  left_join(comparison, by=c("facility", "variable")) %>%
  mutate(greater = value.x > value.y) %>%
  group_by(facility, variable) %>%
  summarise(p.greater = mean(greater)) %>%
  mutate(variable=factor(variable, levels=c("mf", "ip_sf", "lp_sf", "wf"))) %>%
  drop_na()

verificationplot = ggplot(flow.df %>% filter(facility %ni% incomplete.fps), aes(x=value)) +
  geom_density(aes(y=..scaled.., fill=variable, color=variable), alpha=0.5, show.legend=F) +
  geom_vline(data=comparison %>% filter(facility %ni% incomplete.fps), aes(xintercept=value)) +
  geom_label(data=ps %>% filter(facility %ni% incomplete.fps), aes(x=-Inf, y=Inf, hjust=0, vjust=1, label=paste0("p(>x)=", p.greater), family="Times New Roman"), label.size=0) +
  facet_grid(facility~variable, scales="free", space="free_y") +
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) +
  labs(x="Value", y="Scaled density", title="Comparison Between Predicted FP Flows and Sample Data")# +
  # ggsave('../_media/verification.png', width=24.7, height=20, units='cm')
ggplotly(verificationplot)